# -*- coding: utf-8 -*-
""""
爬虫
2002.06.01 潘叶舟
弹出的第一个页面需要自己登陆然后按下G键登录
"""
import re
import os
import scrapy
import pandas as pd
from selenium import webdriver
from TianYanCha_MedicalCompany.settings import SOURCE
from TianYanCha_MedicalCompany.spiders.SeleniumAuxiliary import OpenDriver


class TianyanchaSpider(scrapy.Spider):
    """Spider that looks up medical companies on Tianyancha and saves
    each company's detail page plus its holding-company page to disk.

    Workflow:
      1. ``parse``  — read company names from '医药公司名称.xlsx' and queue
         one search request per name
      2. ``parse0`` — follow the first company link on the result list
      3. ``parse1`` — save the detail page as main.html, then queue the
         holding-company page
      4. ``parse2`` — save the holding-company page as affiliated.html
    """

    name = 'tianyancha'
    start_urls = ['https://www.baidu.com/']
    # NOTE(review): the statements below run at class-definition (import)
    # time — a browser window pops up for a manual login before the crawl
    # starts.  Kept as-is: the rest of the project relies on the cookies
    # being captured before Scrapy schedules any request.
    o = OpenDriver()
    o.login()  # manual login in the popped-up browser
    o.run()    # background job that keeps harvesting the cookies

    def parse(self, response):
        """Queue a Tianyancha search request for every company name
        listed in the Excel sheet."""
        names = pd.read_excel('医药公司名称.xlsx')['公司名称']
        for company in names:
            print(company)
            url = 'https://www.tianyancha.com/search?key=%s' % company
            yield scrapy.Request(url=url, callback=self.parse0,
                                 dont_filter=True)

    def parse0(self, response):
        """From the search-result page, follow the first company link."""
        html_str = response.text
        urls = re.findall(r'href="(https://www.tianyancha.'
                          r'com/company/\d+)"', html_str)
        if urls:
            # Only the top hit is wanted — assumed to be the best match.
            yield scrapy.Request(url=urls[0], callback=self.parse1,
                                 dont_filter=True)

    def parse1(self, response):
        """Save the company detail page and queue the holding-company
        (参股控股) listing for that company."""
        html_str = response.text
        names = re.findall('<h1 class="name">(.+?)</h1>', html_str)
        if not names:
            # Layout changed or we got an anti-bot page — skip instead of
            # crashing the whole crawl with an IndexError.
            return
        grnt_corp_name = names[0]
        # os.path.join instead of literal '\\' concatenation so the paths
        # also work on non-Windows hosts.
        company_dir = os.path.join(SOURCE, grnt_corp_name)
        os.makedirs(company_dir, exist_ok=True)
        # 'with' guarantees the handle is closed even if write() raises.
        with open(os.path.join(company_dir, 'main.html'), 'w',
                  encoding='utf-8') as f:
            f.write(str(html_str))
        print('下载', grnt_corp_name)
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        gid = re.findall(r'"gid":(\d+)', html_str)
        if gid:
            affiliated_enterprise_url = (
                'https://www.tianyancha.com/pagination/holdingCompany.xhtml?'
                'ps=9999&pn=1&id=%s&name=%s' % (gid[0], grnt_corp_name))
            yield scrapy.Request(url=affiliated_enterprise_url,
                                 callback=self.parse2,
                                 meta={'grnt_corp_name': grnt_corp_name})

    def parse2(self, response):
        """Save the holding-company page next to main.html."""
        html_str = response.text
        grnt_corp_name = response.meta['grnt_corp_name']
        company_dir = os.path.join(SOURCE, grnt_corp_name)
        os.makedirs(company_dir, exist_ok=True)
        with open(os.path.join(company_dir, 'affiliated.html'), 'w',
                  encoding='utf-8') as f:
            f.write(str(html_str))